Link to recording: https://smu.zoom.us/rec/share/Gujqwi30bZtZpix1Wlj58pEX7dx37r-WDR0gRYhTein89fP_C1e9dtbBbIxzKQOd.slY31p9lFCdaozCi?startTime=1666477030000 Passcode: !kH

Load in data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.7     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
beers <- data.frame(read.csv('Beers.csv', stringsAsFactors = FALSE))
breweries <- data.frame(read.csv('Breweries.csv', stringsAsFactors = FALSE))
  1. How many breweries are present in each state?
num_breweries <- breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar()+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
geom_text(stat='count', aes(label=..count..), vjust=-1)
num_breweries

Colorado has the largest amount of breweries.

  1. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file. (RMD only, this does not need to be included in the presentation or the deck.)
head(beers)
head(breweries)
breweries <- breweries %>% rename("Brewery_id" = "Brew_ID")
bb <- merge(beers, breweries, by="Brewery_id", all=TRUE)
bb <- bb %>% rename("Beer" = "Name.x") %>% rename("Brewery" = "Name.y")
print(head(bb))
##   Brewery_id          Beer Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces            Brewery        City
## 1                        American IPA     16 NorthGate Brewing  Minneapolis
## 2                  Milk / Sweet Stout     16 NorthGate Brewing  Minneapolis
## 3                   English Brown Ale     16 NorthGate Brewing  Minneapolis
## 4                         Pumpkin Ale     16 NorthGate Brewing  Minneapolis
## 5                     American Porter     16 NorthGate Brewing  Minneapolis
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing  Minneapolis
##   State
## 1    MN
## 2    MN
## 3    MN
## 4    MN
## 5    MN
## 6    MN
print(tail(bb))
##      Brewery_id                      Beer Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                       Brewery          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK
  1. Address the missing values in each column.
summary(bb)
##    Brewery_id        Beer              Beer_ID            ABV         
##  Min.   :  1.0   Length:2410        Min.   :   1.0   Min.   :0.00100  
##  1st Qu.: 94.0   Class :character   1st Qu.: 808.2   1st Qu.:0.05000  
##  Median :206.0   Mode  :character   Median :1453.5   Median :0.05600  
##  Mean   :232.7                      Mean   :1431.1   Mean   :0.05977  
##  3rd Qu.:367.0                      3rd Qu.:2075.8   3rd Qu.:0.06700  
##  Max.   :558.0                      Max.   :2692.0   Max.   :0.12800  
##                                                      NA's   :62       
##       IBU            Style               Ounces        Brewery         
##  Min.   :  4.00   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 21.00   Class :character   1st Qu.:12.00   Class :character  
##  Median : 35.00   Mode  :character   Median :12.00   Mode  :character  
##  Mean   : 42.71                      Mean   :13.59                     
##  3rd Qu.: 64.00                      3rd Qu.:16.00                     
##  Max.   :138.00                      Max.   :32.00                     
##  NA's   :1005                                                          
##      City              State          
##  Length:2410        Length:2410       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
# bb[is.na(bb)] <- 0
library(plyr)
## Warning: package 'plyr' was built under R version 4.1.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
impute.mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
bb <- ddply(bb, ~ Style, transform, ABV = impute.mean(ABV), IBU = impute.mean(IBU))
bb[is.na(bb)] <- 0
summary(bb)
##    Brewery_id        Beer              Beer_ID            ABV         
##  Min.   :  1.0   Length:2410        Min.   :   1.0   Min.   :0.00100  
##  1st Qu.: 94.0   Class :character   1st Qu.: 808.2   1st Qu.:0.05000  
##  Median :206.0   Mode  :character   Median :1453.5   Median :0.05650  
##  Mean   :232.7                      Mean   :1431.1   Mean   :0.05975  
##  3rd Qu.:367.0                      3rd Qu.:2075.8   3rd Qu.:0.06700  
##  Max.   :558.0                      Max.   :2692.0   Max.   :0.12800  
##       IBU            Style               Ounces        Brewery         
##  Min.   :  0.00   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 21.00   Class :character   1st Qu.:12.00   Class :character  
##  Median : 33.41   Mode  :character   Median :12.00   Mode  :character  
##  Mean   : 39.98                      Mean   :13.59                     
##  3rd Qu.: 57.00                      3rd Qu.:16.00                     
##  Max.   :138.00                      Max.   :32.00                     
##      City              State          
##  Length:2410        Length:2410       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
  1. Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
library(ggplot2)
bb %>% ggplot(aes(x = as.factor(State), y = ABV, fill = State)) + geom_bar(stat = "summary", fun = "median") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Median ABV")

bb %>% ggplot(aes(x = as.factor(State), y = IBU, fill = State)) + geom_bar(stat = "summary", fun = "median") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Median IBU")

ME has both the highest median ABV and the highest median IBU of all the states. There is a lot of overlap between median ABV and median IBU.

  1. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
paste0("Max ABV State:", bb$State[which.max(bb$ABV)])
## [1] "Max ABV State: CO"
bb %>% slice_max(ABV)
paste0("Max IBU State:", bb$State[which.max(bb$IBU)])
## [1] "Max IBU State: OR"

The max ABV states is Colorado, and the max IBU state is Oregon.

  1. Comment on the summary statistics and distribution of the ABV variable.
summary(bb$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00100 0.05000 0.05650 0.05975 0.06700 0.12800
bb %>% ggplot(aes(x = ABV)) + geom_histogram() + ggtitle("Histogram of ABV")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The min is .1%, the max is 12.8%, and the median is 5.65%. This shows that most Americans prefer beer than is around the 5%-6% ABV. 7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.

bb %>% ggplot(aes(x = IBU, y = ABV)) + geom_smooth(method = "lm") + geom_point() + ggtitle("ABV vs IBU")
## `geom_smooth()` using formula 'y ~ x'

The linear regression line shows a clear correlation between ABV and IBU. As ABV increases, so does IBU.

  1. Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually. In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.
unique(bb$Style)
##   [1] ""                                   
##   [2] "Abbey Single Ale"                   
##   [3] "Altbier"                            
##   [4] "American Adjunct Lager"             
##   [5] "American Amber / Red Ale"           
##   [6] "American Amber / Red Lager"         
##   [7] "American Barleywine"                
##   [8] "American Black Ale"                 
##   [9] "American Blonde Ale"                
##  [10] "American Brown Ale"                 
##  [11] "American Dark Wheat Ale"            
##  [12] "American Double / Imperial IPA"     
##  [13] "American Double / Imperial Pilsner" 
##  [14] "American Double / Imperial Stout"   
##  [15] "American India Pale Lager"          
##  [16] "American IPA"                       
##  [17] "American Malt Liquor"               
##  [18] "American Pale Ale (APA)"            
##  [19] "American Pale Lager"                
##  [20] "American Pale Wheat Ale"            
##  [21] "American Pilsner"                   
##  [22] "American Porter"                    
##  [23] "American Stout"                     
##  [24] "American Strong Ale"                
##  [25] "American White IPA"                 
##  [26] "American Wild Ale"                  
##  [27] "Baltic Porter"                      
##  [28] "Belgian Dark Ale"                   
##  [29] "Belgian IPA"                        
##  [30] "Belgian Pale Ale"                   
##  [31] "Belgian Strong Dark Ale"            
##  [32] "Belgian Strong Pale Ale"            
##  [33] "Berliner Weissbier"                 
##  [34] "Bière de Garde"                    
##  [35] "Bock"                               
##  [36] "Braggot"                            
##  [37] "California Common / Steam Beer"     
##  [38] "Chile Beer"                         
##  [39] "Cider"                              
##  [40] "Cream Ale"                          
##  [41] "Czech Pilsener"                     
##  [42] "Doppelbock"                         
##  [43] "Dortmunder / Export Lager"          
##  [44] "Dubbel"                             
##  [45] "Dunkelweizen"                       
##  [46] "English Barleywine"                 
##  [47] "English Bitter"                     
##  [48] "English Brown Ale"                  
##  [49] "English Dark Mild Ale"              
##  [50] "English India Pale Ale (IPA)"       
##  [51] "English Pale Ale"                   
##  [52] "English Pale Mild Ale"              
##  [53] "English Stout"                      
##  [54] "English Strong Ale"                 
##  [55] "Euro Dark Lager"                    
##  [56] "Euro Pale Lager"                    
##  [57] "Extra Special / Strong Bitter (ESB)"
##  [58] "Flanders Oud Bruin"                 
##  [59] "Flanders Red Ale"                   
##  [60] "Foreign / Export Stout"             
##  [61] "Fruit / Vegetable Beer"             
##  [62] "German Pilsener"                    
##  [63] "Gose"                               
##  [64] "Grisette"                           
##  [65] "Hefeweizen"                         
##  [66] "Herbed / Spiced Beer"               
##  [67] "Irish Dry Stout"                    
##  [68] "Irish Red Ale"                      
##  [69] "Kölsch"                            
##  [70] "Keller Bier / Zwickel Bier"         
##  [71] "Kristalweizen"                      
##  [72] "Light Lager"                        
##  [73] "Low Alcohol Beer"                   
##  [74] "Märzen / Oktoberfest"              
##  [75] "Maibock / Helles Bock"              
##  [76] "Mead"                               
##  [77] "Milk / Sweet Stout"                 
##  [78] "Munich Dunkel Lager"                
##  [79] "Munich Helles Lager"                
##  [80] "Oatmeal Stout"                      
##  [81] "Old Ale"                            
##  [82] "Other"                              
##  [83] "Pumpkin Ale"                        
##  [84] "Quadrupel (Quad)"                   
##  [85] "Radler"                             
##  [86] "Rauchbier"                          
##  [87] "Roggenbier"                         
##  [88] "Russian Imperial Stout"             
##  [89] "Rye Beer"                           
##  [90] "Saison / Farmhouse Ale"             
##  [91] "Schwarzbier"                        
##  [92] "Scotch Ale / Wee Heavy"             
##  [93] "Scottish Ale"                       
##  [94] "Shandy"                             
##  [95] "Smoked Beer"                        
##  [96] "Tripel"                             
##  [97] "Vienna Lager"                       
##  [98] "Wheat Ale"                          
##  [99] "Winter Warmer"                      
## [100] "Witbier"
bb$Class1 <- NA
for(i in 1:nrow(bb)){
  if(grepl("IPA", bb$Style[i]) == TRUE | grepl("India Pale Ale", bb$Style[i]) == TRUE){
    bb$Class1[i] <- "IPA"
  } else if(grepl("Ale", bb$Style[i]) == TRUE & !grepl("India Pale Ale", bb$Style[i]) == TRUE & !grepl("IPA", bb$Style[i]) == TRUE){
    bb$Class1[i] <- "Ale"
  } else{
    bb$Class1[i] <- NA
  }
}
library(class)
## Warning: package 'class' was built under R version 4.1.3
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
set.seed(1234)
bb_knn_df <- na.omit(bb)
splitPerc <- 0.80
trainIndices <- sample(1:dim(bb_knn_df)[1], round(splitPerc * dim(bb_knn_df)[1]))
train <- bb_knn_df[trainIndices,]
test <- bb_knn_df[-trainIndices,]
train$ABV <- scale(train$ABV)
train$IBU <- scale(train$IBU)
test$ABV <- scale(test$ABV)
test$IBU <- scale(test$IBU)
accuracy <- c()
k <- c()
# Find maximum accuracy and its respective k value
for(i in 1:300){
  classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = i)
  CM <- confusionMatrix(table(classifications, test$Class1))
  accuracy[i] <- CM$overall[1]
  k[i] <- i
  if(i == 1){
    max_k <- i
    accuracy_max <- accuracy[i]
  }else if(accuracy[i] >= accuracy_max){
    max_k <- i
    accuracy_max <- accuracy[i]
  }
}
plot(k, accuracy, type = "l", xlab = "k")

print(paste0("Max k:", max_k))
## [1] "Max k:17"
print(paste0("Max accuracy:", accuracy[max_k]))
## [1] "Max accuracy:0.899022801302932"
classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
  CM <- confusionMatrix(table(classifications, test$Class1))
  print(CM)
## Confusion Matrix and Statistics
## 
##                
## classifications Ale IPA
##             Ale 184  12
##             IPA  19  92
##                                           
##                Accuracy : 0.899           
##                  95% CI : (0.8597, 0.9304)
##     No Information Rate : 0.6612          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7782          
##                                           
##  Mcnemar's Test P-Value : 0.2812          
##                                           
##             Sensitivity : 0.9064          
##             Specificity : 0.8846          
##          Pos Pred Value : 0.9388          
##          Neg Pred Value : 0.8288          
##              Prevalence : 0.6612          
##          Detection Rate : 0.5993          
##    Detection Prevalence : 0.6384          
##       Balanced Accuracy : 0.8955          
##                                           
##        'Positive' Class : Ale             
## 

The accuracy is 0.8893. This is rather good for a KNN model. The sensitivity and specificity are also high. Also, the pos pred value is high meaning that there are a lot of true values that were predicted.

library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
x_test <- test %>% select("ABV", "IBU")
y_test <- test %>% select("Class1")
yscore <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
yscore <- attributes(yscore)$prob

pdb <- cbind(x_test, y_test)
pdb <- cbind(pdb, yscore)

fig <- plot_ly(data = pdb,x = ~IBU, y = ~ABV, type = 'scatter', mode = 'markers',color = ~yscore, colors = 'RdBu', symbol = ~Class1, split = ~Class1, symbols = c('square-dot','circle-dot'), marker = list(size = 12, line = list(color = 'black', width = 1)))
fig

The above plot shows the divisions between the IPAs and Ales graphically. The index shows the probability/the intesisty which the beer was categorized.

p3<-ggplot(train,aes(x=IBU,y=ABV,colour=Class1)) + geom_jitter() +geom_density2d() + ggtitle("Density Plot of ABV vs IBU")
p3

  1. Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in. You must convince them why it is important and back up your conviction with appropriate statistical evidence.
fit.IBU <- aov(IBU~Style, data = bb)
print("ANOVA IBU")
## [1] "ANOVA IBU"
summary(fit.IBU)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## Style         99 1186184   11982   116.1 <2e-16 ***
## Residuals   2310  238303     103                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit.ABV <- aov(ABV~Style, data = bb)
print("ANOVA ABV")
## [1] "ANOVA ABV"
summary(fit.ABV)
##               Df Sum Sq   Mean Sq F value Pr(>F)    
## Style         99 0.2754 0.0027822   40.15 <2e-16 ***
## Residuals   2310 0.1601 0.0000693                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is significant evidence to prove that at least one Style has a different IBU, and one style has a different ABV (p-value < 2e-16).As a result, one can infer that different styles are characterized by different combinations of bitterness and alcohol level. The plot below shows that there are different ABV and IBU levels for each style of beer. More so, there appears to be a greater correlation between style and IBU than style and ABV.

bb %>% ggplot(aes(x = IBU, y = ABV, color = Style)) + geom_point() + theme(legend.position = "None") + ggtitle("ABV vs IBU by Style")